{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "7525e867",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "from sklearn.metrics import mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "4c1fbc9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "7a70df86",
   "metadata": {},
   "outputs": [],
   "source": [
    "old_len = len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "f17cc325",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['duration'] = df.dropOff_datetime - df.pickup_datetime\n",
    "df['duration'] = df.duration.dt.total_seconds() / 60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1b613836",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19.1672240937939"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.duration.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "3092ae43",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[(df.duration >= 1) & (df.duration <= 60)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "2e1aa200",
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical = ['PUlocationID', 'DOlocationID']\n",
    "\n",
    "df[categorical] = df[categorical].fillna(-1).astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "22c6495b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[categorical] = df[categorical].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "12fcfa04",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dicts = df[categorical].to_dict(orient='records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "0606301f",
   "metadata": {},
   "outputs": [],
   "source": [
    "dv = DictVectorizer()\n",
    "X_train = dv.fit_transform(train_dicts) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "c81c6737",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1109826, 525)"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "aeaf7bad",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = df.duration.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "75199886",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "525"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dv.feature_names_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "4d7f1e4b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression()"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lr = LinearRegression()\n",
    "lr.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "d61318d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = lr.predict(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "1aa5f5c2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10.528519107212292"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean_squared_error(y_train, y_pred, squared=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "41c08294",
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical = ['PUlocationID', 'DOlocationID']\n",
    "\n",
    "def read_data(filename):\n",
    "    df = pd.read_parquet(filename)\n",
    "    \n",
    "    df['duration'] = df.dropOff_datetime - df.pickup_datetime\n",
    "    df['duration'] = df.duration.dt.total_seconds() / 60\n",
    "\n",
    "    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()\n",
    "\n",
    "    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "4854399a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_val = read_data('./data/fhv_tripdata_2021-02.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "f9eea69f",
   "metadata": {},
   "outputs": [],
   "source": [
    "val_dicts = df_val[categorical].to_dict(orient='records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "669fda0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_val = dv.transform(val_dicts) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "7f8f950d",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = lr.predict(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "8f129850",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_val = df_val.duration.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "8a277ab9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11.014283211122269"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean_squared_error(y_val, y_pred, squared=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5108416",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
